#########
## Nature Human Behaviour 
## Leading Countries in Global Science Increasingly Receive More Citations than Other Countries Despite Doing Similar Research.
## https://doi.org/10.1038/s41562-022-01351-5
## Harvard Dataverse (Code and Metadata): https://doi.org/10.7910/DVN/WCOINR 
## Figure 2
## Data: Data_20210905
## Data: Data_20210528 (Only for English and Non-English Comparison)
#########

#.rs.restartR()
gc(reset=T)
rm(list = ls(all.names = TRUE))

#### Libraries -----------
library("dplyr")
library("ggplot2")
library("scales")

#### Set up Parameters -----------
minimum_year <- 1980
maximum_year <- 2017
Percentile_Cutoff_Criterion_ <- 0
Citation_Window_Criterion_ <- 5
Inflation_Criterion_ <- "Field_Deflated"
Journal_Censoring_Criterion_ <- "Since_1980"
Sample_Core_Countries_ <- c("United States","China","Germany","United Kingdom","Japan","Netherlands","Switzerland")

#### Functions ----------------------
Grand_SE <- function(x){return(sqrt(sum(x^2,na.rm=T)/length(x)^2))} #https://stats.stackexchange.com/questions/21104/calculate-average-of-a-set-numbers-with-reported-standard-errors

mean_center <- function(x){return(( (x-mean(x,na.rm=T))/sd(x,na.rm=T)))}

standard_error <- function(x){return(sd(x,na.rm=T)/sqrt(length(x)))}

##### MAG MetaData Information ------------------
MAG_Field_ID_df <- read.csv("INPUT_MAG_FieldIDs_Domain.csv") %>%
  dplyr::mutate(Domain = case_when(Domain=="Medicine" ~ "Biomedical, Behavioral,\nand Ecological Sciences",
                                   Domain=="Engineering and Computer Science" ~ "Engineering and\nComputational Sciences",
                                   Domain=="Social Sciences" ~ "Social\nSciences",
                                   Domain=="Life, Behavioral, and Ecological Sciences" ~ "Biomedical, Behavioral,\nand Ecological Sciences",
                                   Domain=="Physical and Mathematical Sciences" ~ "Physical and\nMathematical Sciences",
                                   TRUE ~ as.character(Domain))) %>%
  as.data.frame()


# Read in Regional Data | Data =================================


Regional_df <- read.csv("INPUT_R_Nation_to_Region_Core.csv") %>%
  dplyr::mutate(Region = case_when(Nation=="PUERTO_RICO" ~ "LATIN_AND_SOUTH_AMERICA",
                                   Nation=="CUBA" ~ "LATIN_AND_SOUTH_AMERICA",
                                   Nation=="GREENLAND" ~ "WESTERN_EUROPE",
                                   Nation=="FRENCH_POLYNESIA" ~ "OCEANIA",
                                   Nation=="BRUNEI_DARUSSALAM" ~ "SOUTH_EAST_ASIA",
                                   Nation=="COMOROS" ~ "SUB_AFRICA",
                                   Nation=="CABO_VERDE" ~ "SUB_AFRICA",
                                   Nation=="KIRIBATI" ~ "OCEANIA",
                                   Nation=="LAO_PDR" ~ "SOUTH_EAST_ASIA",
                                   Nation=="MOLDOVA" ~ "EASTERN_EUROPE_USSR",
                                   Nation=="MAURITANIA" ~ "MENA",
                                   Nation=="NEW_CALEDONIA" ~ "OCEANIA",
                                   Nation=="SAN_MARINO" ~ "WESTERN_EUROPE",
                                   Nation=="SAO_TOME_AND_PRINCIPE" ~ "SUB_AFRICA",
                                   Nation=="TUVALU" ~ "OCEANIA",
                                   Nation=="ST_VINCENT_AND_THE_GRENADINES" ~ "CARIBBEAN",
                                   Nation=="MAURITIUS" ~ "SUB_AFRICA",
                                   Nation=="RUSSIA" ~ "CENTRAL_ASIA",
                                   TRUE ~ as.character(Region))) %>%
  dplyr::mutate(Region = case_when(
    Region=="CARIBBEAN" ~ "Latin America\nand Caribbean",
    Region=="WESTERN_EUROPE" ~ "Europe",
    Region=="SUB_AFRICA" ~ "MENA and\nSub-Saharan Africa",
    Region=="EASTERN_EUROPE_USSR" ~ "Europe",
    Region=="EAST_ASIA" ~ "Asia",
    Region=="CENTRAL_ASIA" ~ "Asia",
    Region=="SOUTH_EAST_ASIA" ~ "Asia",
    Region=="LATIN_AND_SOUTH_AMERICA" ~ "Latin America\nand Caribbean",
    Region=="NORTH_AMERICA" ~ "US and Canada",
    Region=="MENA" ~ "MENA and\nSub-Saharan Africa",
    Region=="OCEANIA" ~ "Oceania"
  ))%>%
  as.data.frame() %>%
  dplyr::select("Nation","Region") 

# Plot 2 | Data =================================

Beta_df <- plyr::ldply(list.files(pattern="OUTPUT_Python_Field_MRQAP_Beta_KLD_Citation_Corpus_RAKE_and_GoogleAPI_EnglishOnly_*",
                                  full.names=T), 
                       read.csv) %>%
  # 2022 02 20 - Test Without Significance. 
  subset(Stars!="" &
           Year>=minimum_year &
           Year<=(maximum_year-Citation_Window_Criterion_)) %>%
  
  merge(.,subset(MAG_Field_ID_df,select=c("FieldID","Name")),by.x=c("Discipline"),by.y=c("FieldID"),how="left") %>%
  subset(Coefficients!="Intercept") %>%
  as.data.frame() %>%
  subset(Coefficients%in%c("Citations_ji"))%>%
  subset(Percentile_Cutoff %in% c(0,0.25,0.5,0.75)) %>%
  tidyr::drop_na()

KLD_Degree_Centrality_df <- plyr::ldply(list.files(pattern="OUTPUT_Python_Field_Degree_Centrality_KLD_Citation_Corpus_RAKE_and_GoogleAPI_EnglishOnly_*",
                                                   full.names=T), 
                                        read.csv) %>%
  dplyr::mutate(Country = stringr::str_to_title(stringr::str_replace(as.character(Country),"_"," "))) %>%
  as.data.frame()

# Plot 2 | Number of Countries | Data =================================

num_Countries_df <- KLD_Degree_Centrality_df %>%
  dplyr::select(Discipline,Year,Percentile_Cutoff,Citation_Type,Citation_Window,Journal_Censoring,Country)%>%
  dplyr::group_by(Discipline,Year,Percentile_Cutoff,Citation_Type,Citation_Window,Journal_Censoring)%>%
  dplyr::summarise(Number_of_Countries=n())%>%
  subset(Year>=minimum_year & Year<=maximum_year-Citation_Window_Criterion_) %>%
  tidyr::drop_na()%>%
  as.data.frame()
  
num_Countries_df_DOMAIN <- KLD_Degree_Centrality_df %>%
  dplyr::select(Discipline,Year,Percentile_Cutoff,Citation_Type,Citation_Window,Country,Journal_Censoring)%>%
  dplyr::group_by(Discipline,Year,Percentile_Cutoff,Citation_Type,Citation_Window,Journal_Censoring)%>%
  dplyr::summarise(Number_of_Countries=n())%>%
  merge(.,MAG_Field_ID_df %>%
          dplyr::rename(Discipline=FieldID) %>%
          dplyr::select(Discipline,Domain),
        by=c("Discipline")) %>%
  dplyr::group_by(Domain,Percentile_Cutoff,Citation_Type,Citation_Window,Journal_Censoring,Year) %>%
  dplyr::summarise(Number_of_Countries_Mean = mean(`Number_of_Countries`,na.rm=T),
                   Number_of_Countries_SE = sd(`Number_of_Countries`,na.rm=T)/sqrt(length(`Number_of_Countries`))) %>%
  subset(Year>=minimum_year & Year<=maximum_year-Citation_Window_Criterion_) %>%
  as.data.frame() %>%
  dplyr::rename(Number_of_Countries=Number_of_Countries_Mean)%>%
  tidyr::drop_na()%>%
  dplyr::mutate(Percentile_Cutoff_Labels = case_when(Percentile_Cutoff==0 ~ "All Inclusive",
                                                     Percentile_Cutoff==0.25 ~ "Above 25th\nPercentile",
                                                     Percentile_Cutoff==0.5 ~ "Above 50th\nPercentile",
                                                     Percentile_Cutoff==0.75 ~ "Above 75th\nPercentile"))%>%
  dplyr::mutate(Domain_Label = case_when(Domain=="Biomedical, Behavioral,\nand Ecological Sciences" ~ "Bio",
                                         Domain=="Engineering and\nComputational Sciences" ~ "Engineering",
                                         Domain=="Physical and\nMathematical Sciences" ~ "Physical/Math",
                                         Domain=="Social\nSciences" ~ "Social"))


num_Countries_df_Percentile <- num_Countries_df %>%
  subset(Percentile_Cutoff==Percentile_Cutoff_Criterion_) %>%
  dplyr::select(-c(Percentile_Cutoff)) %>%
  subset(Citation_Type==Inflation_Criterion_) %>%
  subset(Citation_Window==Citation_Window_Criterion_)%>%
  subset(Journal_Censoring==Journal_Censoring_Criterion_)%>%
  as.data.frame()

num_Countries_df_DOMAIN_Percentile <- num_Countries_df_DOMAIN %>%
  subset(Percentile_Cutoff==Percentile_Cutoff_Criterion_) %>%
  dplyr::select(-c(Percentile_Cutoff))%>%
  subset(Citation_Type==Inflation_Criterion_) %>%
  subset(Citation_Window==Citation_Window_Criterion_)%>%
  subset(Journal_Censoring==Journal_Censoring_Criterion_)%>%
  as.data.frame()

### Plot 2 | Number of Countries | Plot ############################# 

plot2A_Number_Countries <- ggplot(data=num_Countries_df_DOMAIN_Percentile,
                                aes(x=Year,y=Number_of_Countries))+
  #geom_point(alpha=0.1,aes(group=Discipline))+
  #geom_line(alpha=0.1,aes(group=Discipline))+
  #geom_violin(aes(group=Year))+
  
  # Example Domains
  geom_point(data=num_Countries_df_DOMAIN_Percentile,aes(colour=Domain))+
  geom_line(data=num_Countries_df_DOMAIN_Percentile,aes(colour=Domain))+
  geom_ribbon(data=num_Countries_df_DOMAIN_Percentile,
              aes(colour=Domain,
                  fill=Domain,
                  ymax=Number_of_Countries+Number_of_Countries_SE,
                  ymin=Number_of_Countries-Number_of_Countries_SE),
              alpha=0.2)+
  
  #stat_summary(fun.data = "mean_se", colour = "black", size = 1, aes(group=Year)) + 
  xlab("Years")+
  ylab("Number of Countries")+
  theme_bw()+
  # ggrepel::geom_label_repel(data=subset(num_Countries_df_DOMAIN_Percentile,
  #                                       Year==2012),
  #                           aes(fill=Domain,label=Domain,size=5),
  #                           fontface = 'bold', color = 'white',show.legend = F)+
  theme(text = element_text(size=16),
        panel.border = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.line = element_line(colour = "black")) +
  theme(legend.text=element_text(size=8))+
  theme(legend.position="bottom")+
  theme(strip.background = element_blank(), strip.placement = "outside")+
  theme(legend.title=element_blank())+
  theme(legend.background = element_rect(colour = "black"))+
  guides(colour=guide_legend(nrow=2))+
  ggsci::scale_color_npg()+
  ggsci::scale_fill_npg()

# Plot 2 | Beta | Data =================================

Beta_df_Trends <- Beta_df %>%
  dplyr::group_by(Coefficients,Year,Country_Inclusion_Criteria,Percentile_Cutoff,Citation_Type,Citation_Window,Journal_Censoring,Model_Type) %>%
  dplyr::summarise(Beta_Mean = mean(Beta,na.rm=T),
                   Beta_SE = Grand_SE(Beta/Test_Statistics),
                   N=length(Beta)) %>%
  as.data.frame() %>%
  dplyr::rename("Beta"="Beta_Mean") %>%
  subset(Coefficients!="Intercept") %>%
  dplyr::mutate(Percentile_Cutoff_Labels = case_when(Percentile_Cutoff==0 ~ "All Inclusive",
                                                     Percentile_Cutoff==0.25 ~ "Above 25th\nPercentile",
                                                     Percentile_Cutoff==0.5 ~ "Above 50th\nPercentile",
                                                     Percentile_Cutoff==0.75 ~ "Above 75th\nPercentile"))

# Number of Networks 
# Beta_df %>% select(Discipline,Year) %>% unique() %>% dim()

Beta_df_Percentile <- Beta_df %>%
  subset(Percentile_Cutoff==Percentile_Cutoff_Criterion_) %>%
  subset(select=-c(Percentile_Cutoff)) %>%
  subset(Citation_Type==Inflation_Criterion_) %>%
  subset(Citation_Window==Citation_Window_Criterion_)%>%
  subset(Journal_Censoring==Journal_Censoring_Criterion_)%>%
  subset(Model_Type=="Citations_ji_and_NumDiffijPapers")%>%
  subset(Country_Inclusion_Criteria!='All_wo_Core')%>%
  dplyr::mutate(Country_Inclusion_Criteria = case_when(as.character(Country_Inclusion_Criteria)=="All"~"Core+Periphery",
                                                     TRUE~as.character(Country_Inclusion_Criteria)))%>%
  dplyr::mutate(Country_Inclusion_Criteria = factor(Country_Inclusion_Criteria,levels=c("Core+Periphery","Core")))%>%
 as.data.frame()
  
Beta_df_Trends_Percentile <- Beta_df_Trends %>%
  subset(Percentile_Cutoff==Percentile_Cutoff_Criterion_) %>%
  subset(select=-c(Percentile_Cutoff))%>%
  subset(Model_Type=="Citations_ji_and_NumDiffijPapers")%>%
  subset(Citation_Type==Inflation_Criterion_) %>%
  subset(Country_Inclusion_Criteria!='All_wo_Core')%>%
  subset(Citation_Window==Citation_Window_Criterion_)%>%
  subset(Journal_Censoring==Journal_Censoring_Criterion_)%>%
  dplyr::mutate(Country_Inclusion_Criteria = case_when(as.character(Country_Inclusion_Criteria)=="All"~"Core+Periphery",
                                                       TRUE~as.character(Country_Inclusion_Criteria)))%>%
  dplyr::mutate(Country_Inclusion_Criteria = factor(Country_Inclusion_Criteria,levels=c("Core+Periphery","Core")))%>%
  as.data.frame()

# Plot 2 | Beta | Plots =================================

plot2B_Beta_Coefficients <- ggplot(data=Beta_df_Percentile,
                                 aes(x=Year,
                                     y=Beta,
                                     colour=Country_Inclusion_Criteria))+
  #geom_point(aes(group=Discipline),alpha=0.02)+
  #geom_line(aes(group=Discipline),alpha=0.02)+
  #geom_errorbar(aes(ymin=Beta-(Beta/Test_Statistics),
  #                  ymax=Beta+(Beta/Test_Statistics),
  #                  group=Discipline),alpha=0.02)+
  
  #### Grand Trends
  geom_point(data=Beta_df_Trends_Percentile,
             aes(group=Country_Inclusion_Criteria))+
  geom_line(data=Beta_df_Trends_Percentile,
            aes(group=Country_Inclusion_Criteria))+
  geom_ribbon(data=Beta_df_Trends_Percentile,
              aes(ymin=Beta-Beta_SE,
                  ymax=Beta+Beta_SE,
                  colour=Country_Inclusion_Criteria,
                  fill=Country_Inclusion_Criteria),
              alpha=0.2)+
  # ggrepel::geom_label_repel(data=subset(Beta_df_Trends_Percentile,
  #                                       Year==2012),
  #                           aes(fill=Country_Inclusion_Criteria,label=Country_Inclusion_Criteria,size=5),
  #                           fontface = 'bold', color = 'white',show.legend = F)+
  theme_bw() + 
  #geom_vline(xintercept = 2005,color="red",linetype="dashed")+
  ylab("Beta Coefficient\n(Standard Deviations)")+
  xlab("Years")+
  #ylim(c(0.2,0.35))+ #Original c(0.15,0.45)
  theme(text = element_text(size=16),
        panel.border = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.line = element_line(colour = "black")) +
  theme(legend.position="bottom")+
  theme(strip.background = element_blank(), strip.placement = "outside")+
  theme(legend.title=element_blank())+
  theme(legend.background = element_rect(colour = "black"))+
  ggtitle("")+
  theme(plot.title = element_text(hjust = 0.5))+
  ggsci::scale_color_npg()+
  ggsci::scale_fill_npg()


# Plot 2 | Beta Domain | Data =================================


Beta_df_DOMAIN <- plyr::ldply(list.files(pattern="OUTPUT_Python_Field_MRQAP_Beta_KLD_Citation_Corpus_RAKE_and_GoogleAPI_EnglishOnly_*",
                                         full.names=T), 
                              read.csv)%>%
  subset(Stars!="" & 
           Year>=minimum_year & 
           Year<=(maximum_year-Citation_Window_Criterion_)) %>%
  merge(.,subset(MAG_Field_ID_df,select=c("FieldID","Domain")),by.x=c("Discipline"),by.y=c("FieldID"),how="left") %>%
  subset(Coefficients!="Intercept") %>%
  as.data.frame() %>%
  subset(Coefficients%in%c("Citations_ji"))%>%
  subset(Percentile_Cutoff %in% c(0,0.25,0.5,0.75)) %>%
  tidyr::drop_na()

Beta_df_DOMAIN_Trends <- Beta_df_DOMAIN %>%
  dplyr::group_by(Domain,Coefficients,Year,Country_Inclusion_Criteria,Percentile_Cutoff,Citation_Window,Citation_Type) %>%
  dplyr::summarise(Beta_Mean = mean(Beta,na.rm=T),
                   Beta_SE = Grand_SE(Beta/Test_Statistics)) %>%
  as.data.frame() %>%
  dplyr::rename("Beta"="Beta_Mean") %>%
  subset(Coefficients!="Intercept") %>%
  dplyr::mutate(Percentile_Cutoff_Labels = case_when(Percentile_Cutoff==0 ~ "All Inclusive",
                                                     Percentile_Cutoff==0.25 ~ "Above 25th\nPercentile",
                                                     Percentile_Cutoff==0.5 ~ "Above 50th\nPercentile",
                                                     Percentile_Cutoff==0.75 ~ "Above 75th\nPercentile"))


Beta_df_DOMAIN_Percentile <- Beta_df_DOMAIN %>%
  subset(Percentile_Cutoff==Percentile_Cutoff_Criterion_) %>%
  subset(select=-c(Percentile_Cutoff))%>%
  subset(Country_Inclusion_Criteria!="All_wo_Core") %>%
  subset(Citation_Type==Inflation_Criterion_) %>%
  subset(Citation_Window==Citation_Window_Criterion_)%>%
  dplyr::mutate(Country_Inclusion_Criteria = case_when(as.character(Country_Inclusion_Criteria)=="All"~"Core+Periphery",
                                                       TRUE~as.character(Country_Inclusion_Criteria)))%>%
  dplyr::mutate(Country_Inclusion_Criteria = factor(Country_Inclusion_Criteria,levels=c("Core+Periphery","Core")))%>%
  as.data.frame()

Beta_df_DOMAIN_Trends_Percentile <- Beta_df_DOMAIN_Trends %>%
  subset(Percentile_Cutoff==Percentile_Cutoff_Criterion_) %>%
  subset(select=-c(Percentile_Cutoff))%>%
  subset(Country_Inclusion_Criteria!="All_wo_Core") %>%
  subset(Citation_Type==Inflation_Criterion_) %>%
  subset(Citation_Window==Citation_Window_Criterion_)%>%
  dplyr::mutate(Country_Inclusion_Criteria = case_when(as.character(Country_Inclusion_Criteria)=="All"~"Core+Periphery",
                                                       TRUE~as.character(Country_Inclusion_Criteria)))%>%
  dplyr::mutate(Country_Inclusion_Criteria = factor(Country_Inclusion_Criteria,levels=c("Core+Periphery","Core")))%>%
  as.data.frame()

# Plot 2 | Beta Domain | Plot =================================

plot2C_Beta_Coefficients_DOMAIN <- ggplot(data=subset(Beta_df_DOMAIN_Percentile,
                                                    Year>=minimum_year & Year<=maximum_year & Model_Type=="Citations_ji"),
                                        aes(x=Year,y=Beta,colour=Country_Inclusion_Criteria))+

  #### Grand Trends
  geom_point(data=subset(Beta_df_DOMAIN_Trends_Percentile,Year>=minimum_year & Year<=maximum_year ),
             aes(group=Country_Inclusion_Criteria))+
  geom_line(data=subset(Beta_df_DOMAIN_Trends_Percentile,Year>=minimum_year & Year<=maximum_year ),
            aes(group=Country_Inclusion_Criteria))+
  geom_ribbon(data=subset(Beta_df_DOMAIN_Trends_Percentile,Year>=minimum_year & Year<=maximum_year),
              aes(ymin=Beta-Beta_SE,
                  ymax=Beta+Beta_SE,
                  fill=Country_Inclusion_Criteria,
                  color=Country_Inclusion_Criteria),
             alpha=0.2)+
  # ggrepel::geom_label_repel(data=subset(Beta_df_DOMAIN_Trends_Percentile,
  #                                       Year==2012),
  #                           aes(fill=Country_Inclusion_Criteria,label=Country_Inclusion_Criteria,size=5),
  #                           fontface = 'bold', color = 'white',show.legend = F)+
  theme_bw() + 
  ylab("Beta Coefficient\n(Standard Deviations)")+
  xlab("Years")+
  #ylim(c(0.1,0.4))+
  theme(text = element_text(size=16),
        panel.border = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.line = element_line(colour = "black")) +
  theme(legend.position="bottom")+
  theme(strip.background = element_blank(), strip.placement = "outside")+
  theme(legend.title=element_blank())+
  theme(legend.background = element_rect(colour = "black"))+
  ggtitle("")+
  theme(plot.title = element_text(hjust = 0.5))+
  ggsci::scale_color_npg()+
  ggsci::scale_fill_npg()+
  facet_grid(~Domain)


# Plot 2 | Combine Plots =================================


plot_2ab <- ggpubr::ggarrange(plot2A_Number_Countries,
                              plot2B_Beta_Coefficients,
                              labels=c("A","B"),
                              nrow=1,
                              ncol=2)

plot_2c <- ggpubr::ggarrange(plot2C_Beta_Coefficients_DOMAIN,
                             labels=c("C"),
                             nrow=1,
                             ncol=1)

plot_2 <- ggpubr::ggarrange(plot_2ab,
                            plot_2c,
                            nrow=2,
                            ncol=1)

# Plot 2 | Combine Plots | Print =================================

pdf(paste("NEW_FIGURE_R_Plot2.pdf",sep=""),width=(6*1.75), height=6*2)
plot(plot_2)
dev.off()


